**adjust directory names so that they correspond to where data files are saved**

clear all
set more off

**data cleaning*****************************************************************

* Root folder that holds the data sub-folders and receives the cleaned files.
* This is the only path that has to be edited when the project is moved.
local root "D:\Documents\Data and Dofiles"

* Each step below follows the same pattern: change into the sub-folder of one
* data source, run its cleaning do-file there, return to the root folder and
* save the data in memory under a cleaned_ name, then empty the memory.
* NOTE(review): this assumes each cleaning do-file leaves the cleaned data in
* memory when it finishes - confirm against the do-files themselves.

* clean school data

cd "`root'\school_data"
do "clean_school_data"
cd "`root'"
save "cleaned_school", replace
clear

* clean tax data

cd "`root'\tax_data"
do "clean_tax_data"
cd "`root'"
save "cleaned_tax", replace
clear

* clean population and election data

cd "`root'\population_election_data"
do "clean_population_election_data"
cd "`root'"
save "cleaned_population_election", replace
clear

**data merging******************************************************************

* Combine the two district-level files (tax and population/election) into one
* file keyed by province, district and year. The merge adds its match
* indicator under the default name _merge.

use "cleaned_tax"
merge 1:1 province district year using "cleaned_population_election"
sort province district year
save "temp_1", replace

* Replicate the district-level data once per school type (1 to 4): the data in
* memory become type 1, and each pass of the loop appends one more copy of
* temp_1, whose rows arrive without school_type and are labelled right away.

generate school_type = 1

forvalues type = 2/4 {
    append using "temp_1"
    replace school_type = `type' if school_type == .
}

* there are only school types 1 & 2 prior to 2011, so rows for types 3 and 4
* in earlier years are removed
keep if !(year < 2011 & school_type > 2)

* Sample unit id: district code followed by the school type as last digit
* (id = 10*district + school_type). The id is stored with the default type.

generate id = 10*district + school_type
order id province district school_type year
sort id province district school_type year

* Replace missing values: inside each id, a missing tax figure takes the value
* of the row directly before it, so gaps are filled forward row by row. The
* first row of an id is never filled. Logs are taken after the fill.
* NOTE(review): rows of one id are in year order only if province, district
* and school_type are constant within id - expected from how id is built,
* but confirm that a district code never appears under two provinces.

foreach taxvar of varlist educ_tax gross_tax {
    by id: replace `taxvar' = `taxvar'[_n-1] if `taxvar' == . & _n > 1
    generate ln_`taxvar' = ln(`taxvar')
}

* Merge with school data. The match indicator is stored as _merge2 because
* _merge already exists from the earlier district-level merge.

merge 1:1 id province district school_type year using "cleaned_school", generate(_merge2)

* Drop and generate new variables

* keep only rows where neither student nor schools is missing
drop if missing(student, schools)

* schooltype renumbers school_type: 3 -> 1, 4 -> 2, 1 -> 3, 2 -> 4.
* Any other value of school_type is left missing.
generate schooltype = cond(school_type == 3, 1, ///
    cond(school_type == 4, 2, ///
    cond(school_type == 1, 3, ///
    cond(school_type == 2, 4, .))))

* metropolitan is 1 for province codes below 30 and 0 for codes above 30.
* NOTE(review): a province code of exactly 30 gets a missing value, and a
* missing province code is coded 0 because missing compares as larger than
* any number - confirm that neither case occurs in the data.
generate metropolitan = cond(province < 30, 1, cond(province > 30, 0, .))

**creating variables for diff-in-diff analyses**********************************

* drop districts not in either treatment or control group

drop if inlist(district, 3380, 2231, 3742, 3241, 3601)

* NOTE(review): districts 3241 and 3601 are dropped above but are still named
* in the lists below (id 32412 in treat_4, district 3601 in treat_3). Those
* entries can never match. They are kept here so the lists stay as originally
* written - confirm whether the drop or the lists should change.

* generate new var #1: treated districts
* Each treat_k is 1 for the listed districts (or ids) and 0 for all others.
* The lists are written with inlist so that every command ends on its own
* closing parenthesis and no trailing line-continuation marker is left over.

* implemented policy in 2009 (14 districts)
generate treat_1 = inlist(district, 3139, ///
    3836, 3835, 3840, 3831, ///
    3537, 3533, 3536, 3531, ///
    3534, 3538, 3532, 3535, ///
    3647)

* 2011 (7)
generate treat_2 = inlist(district, 2332, 3235, ///
    3838, 3837, 3832, 3833, 3834)

* 2012 (18)
generate treat_3 = inlist(district, 3232, ///
    3811, 3809, 3805, 3810, ///
    3807, 3806, 3803, 3808, ///
    3648, 3644, 3636, 3633, ///
    3642, 3641, 3645, 3601, 3646)

* 2013 (27 entries in the original list)
* Some units are identified by id (district code plus school type digit)
* instead of by district, so only that school type is marked as treated.
* NOTE(review): id 32372 was listed twice in the original; the repeat is
* removed here, which leaves 26 distinct entries. One of the two may have been
* meant to be a different id - check against the policy records.
* NOTE(review): district 3647 is also in the 2009 list (treat_1) - confirm
* which start year is correct.
generate treat_4 = inlist(district, 3118, 3239, ///
    3638, 3637, 3640, 3632, ///
    3631, 3647, 3635, 3643, ///
    3606, 3639, 3603, 3602, 3604, ///
    3731) ///
    | inlist(id, 32312, 32012, 32372, 32412, 32362, ///
    32042, 32032, 32332, 32022, 32052)

* treated in any of the four waves
generate treat_district = (treat_1 == 1 | treat_2 == 1 | treat_3 == 1 | treat_4 == 1)

* generate new var #2: treated districts in treated years
* treat_dd is 1 when a unit belongs to a treatment wave and the year is at or
* after that wave's start year (2009, 2011, 2012, 2013), and 0 otherwise.

generate treat_dd = (treat_1 == 1 & year >= 2009) ///
    | (treat_2 == 1 & year >= 2011) ///
    | (treat_3 == 1 & year >= 2012) ///
    | (treat_4 == 1 & year >= 2013)

* generate new var #3: dummy variables for year and district
* (one indicator per distinct value, named year_dum1, ... and dist_dum1, ...)

quietly tabulate year, generate(year_dum)
quietly tabulate district, generate(dist_dum)

* generate new var #4: dummy variables for treated years
* after_policyK is 1 from the start year of wave K onward and 0 before it.
* NOTE(review): a missing year compares as larger than any number, so it
* would be coded 1 here and in treat_dd - confirm year is never missing.

local wave = 0
foreach start in 2009 2011 2012 2013 {
    local ++wave
    generate after_policy`wave' = (year >= `start')
}

save "complete_data(2005-2014)", replace

**end of data merge*************************************************************
